x
V
W
h = sigmoid(c + W x) W c
ˆ
y = b + V h
V b h
x
$f_\theta(x) = b + V\,\mathrm{sigmoid}(c + W x),$
$\mathrm{sigmoid}(a) = 1/(1 + e^{-a})$
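$x \in \mathbb{R}^{n_i}$
$h = \mathrm{sigmoid}(c + W x)$ $n_h$
$\theta = (b, c, V, W)$ $\theta$
$b \in \mathbb{R}^{n_o}$ $n_o$
$c \in \mathbb{R}^{n_h}$ $h$
$V \in \mathbb{R}^{n_o \times n_h}$
$W \in \mathbb{R}^{n_h \times n_i}$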
$L(\hat{y}, y) = \|\hat{y} - y\|^2$
$\hat{y}$
$E[Y \mid x]$ $L^2$
$\|\omega\|^2 = \left(\sum_{ij} W_{ij}^2 + \sum_{ki} V_{ki}^2\right)$ $\omega$
$W$ $V$ $L^2$
λ
$J(\theta) = \lambda\|\omega\|^2 + \frac{1}{n}\sum_{t=1}^{n} \left\|y^{(t)} - \left(b + V\,\mathrm{sigmoid}(c + W x^{(t)})\right)\right\|^2.$
$(x^{(t)}, y^{(t)})$ $t$
θ
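$\omega \leftarrow \omega - \epsilon\left(2\lambda\omega + \nabla_\omega L(f_\theta(x^{(t)}), y^{(t)})\right)$
$\beta \leftarrow \beta - \epsilon\,\nabla_\beta L(f_\theta(x^{(t)}), y^{(t)}),$
$\beta = (b, c)$ $\omega = (W, V)$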
t
n
$x_1, x_2$
$(h_1, h_2)$
×
y x
y
f
f
$E[\|y - f(x)\|^2]$
f
y x
$\arg\min_{f \in \mathcal{H}} E_{p(x,y)}[\|y - f(x)\|^2] = E_{p(x,y)}[y \mid x].$
$\mathcal{H}$ $E_{p(x,y)}[y \mid x].$
$h^{k} = \tanh(b^{k} + W^{k} h^{k-1})$
$h^{0} = x$ $h^{k}$
$k > 0$
$k$ $W^{k}$
$b^{k}$
f
θ
(x)
tanh
$\max(0, b + w \cdot x)$ $\max_i (b_i + W_{:,i} \cdot x)$
$a = b + W x$
$h = \phi(a) \Leftrightarrow h_i = \phi(a_i) = \phi(b_i + W_{i,:}\,x).$
φ(a) = max(0, a)
$\phi(a) = (a)^{+}$
$\alpha_i$
$a < 0$ $h_i = \phi(a, \alpha_i) = \max(0, a) + \alpha_i \min(0, a)$ $\alpha_i$
$\alpha_i$
$\alpha_i$
φ(a) = tanh(a)
$\phi(a) = 1/(1 + e^{-a})$
$\phi_i(a) = \mathrm{softmax}(a)_i = e^{a_i} / \sum_j e^{a_j}$
$\sum_i \phi_i(a) = 1$ $\phi_i(a) > 0$
$\tanh(x) = 2 \times \mathrm{sigmoid}(2x) - 1$
x
$h_i = \exp\left(-\|w_i - x\|^2 / \sigma_i^2\right)$
$\sigma_i$
$w_i = x^{(t)}$
$t$ $i$
$\phi(a) = \zeta(a) = \log(1 + e^{a})$
tanh
$\phi(a) = \max(-1, \min(1, a))$
φ(a) = |a|
tanh
w
i
$h_i = \max_i (b_i + w_i \cdot x)$
k
k 1
t + 1
t
$L(f_\theta(x), y) = \|f_\theta(x) - y\|^2$
f
P (x, y) f(x) = E[y | x = x]
y x
y
$y \in \{0, 1\}$
$L(f_\theta(x), y) = -y \log f_\theta(x) - (1 - y) \log(1 - f_\theta(x))$
$f$ $f(x) = P(y = 1 \mid x)$
f
θ
(x)
$L_{\mathrm{NLL}}(f_\theta(x), y) = -\log P(y = y \mid x = x; \theta).$
P
y x Q
(x, y)
y x
y
x f
θ
(x) σ
2
$-\log P(y \mid x; \theta) = \frac{1}{2}\,\frac{(f_\theta(x) - y)^2}{\sigma^2} + \log(2\pi\sigma^2).$
θ
x
$p = f_\theta(x)$
$y = 1$ $x = x$ $1 - p$ $y = 0$
$L_{\mathrm{NLL}} = -\log P(y \mid x; \theta) = -1_{y=1} \log p - 1_{y=0} \log(1 - p)$
$= -y \log f_\theta(x) - (1 - y) \log(1 - f_\theta(x)).$
$1_{y=1}$
$\{1, \ldots, n\}$
$n - 1$
$p_i = P(y = i \mid x)$ $P(y = n \mid x) = 1 - \sum_{i=1}^{n-1} P(y = i \mid x).$
n
$p = \mathrm{softmax}(a) \Leftrightarrow p_i = \frac{e^{a_i}}{\sum_j e^{a_j}}.$
a
$a = b + W h$ $a$
$n$
$n - 1$ $a$
a
i
i
$L_{\mathrm{NLL}}(p, y) = -\log p_y$
a
y
a
y
y a
i
i = y x
p
y
= 1 x p
i
= i x
a
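$\frac{\partial}{\partial a_k} L_{\mathrm{NLL}}(p, y) = \frac{\partial}{\partial a_k}(-\log p_y) = \frac{\partial}{\partial a_k}\left(-a_y + \log \sum_j e^{a_j}\right)$
$= -1_{y=k} + \frac{e^{a_k}}{\sum_j e^{a_j}} = p_k - 1_{y=k}$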
or
$\frac{\partial}{\partial a} L_{\mathrm{NLL}}(p, y) = (p - e_y)$
$e_y = [0, \ldots, 0, 1, 0, \ldots, 0]$ $y$
x a a
$p = E_y[e_y \mid x]$
p
i
y = i
p
i
P (y = i | x)
e
y
y y
logp(y = i | x) a p(y = i | x)
i = i
$j \neq i$
$\frac{\partial}{\partial a_j} L_{\mathrm{NLL}}(p, y) = p_j.$
= j p
j
0
j p
j
1
a
j
y p
y
0
a
y
a
j
a
j
a
j
a
j
p
j
1 a
j
a
j
p
y
0 a
y
a
y
p
y
1
y=y
= p
y
1 < 0
a
j
j = y
p
j
> 0
$y = e_i = [0, \ldots, 0, 1, 0, \ldots, 0]$
$y = i \Rightarrow y_i = 1$, $y_j = 0, \ \forall j \neq i$
p = softmax(a) a
a = b + W h h
$L_2(p(a), y) = \|p(a) - y\|^2$
a
$\frac{\partial}{\partial a_i} L_2(p(a), y) = \frac{\partial L_2(p(a), y)}{\partial p(a)}\,\frac{\partial p(a)}{\partial a_i} = \sum_j 2\,(p_j(a) - y_j)\,p_j\,(1_{i=j} - p_i).$
= i
p
y
= p
i
0 a
y
a
y
L
2
(p(a), y) 0
softmax(a) = softmax(a + b).
$\mathrm{softmax}(a) = \mathrm{softmax}(a - \max_i a_i).$
a
a
i
p(y | ω) ω
p(y | x) ω
x
$p(y \mid \omega = f_\theta(x))$
f
θ
(x) x y
f
θ
(x)
y ω = f
θ
(x)
y
y
ω
y
x
ω
$L(x, y) = -\log p(y \mid \omega = f_\theta(x))$
y
y y x
y x
x x
σ
2
y x x
y f
θ
(x)
σ
σ
2
$\frac{1}{n}\sum_{t=1}^{n} \left(y^{(t)} - f_\theta(x^{(t)})\right)^2$
(t)
t (x
(t)
, y
(t)
)
y d σ
2
σ
σ
2
$\frac{1}{nd}\sum_{t=1}^{n} \left\|y^{(t)} - f_\theta(x^{(t)})\right\|^2.$
σ
2
i
σ
2
i
$\frac{1}{n}\sum_{t=1}^{n} \left(y_i^{(t)} - f_{\theta,i}(x^{(t)})\right)^2.$
Σ
$\frac{1}{n}\sum_{t=1}^{n} \left(y^{(t)} - f_\theta(x^{(t)})\right)\left(y^{(t)} - f_\theta(x^{(t)})\right)^{\top}$
Σ(x) x
Σ(x)
$\sigma_i(x) = \mathrm{softplus}(g_\theta(x)).$
g
θ
(x) x
σ x
ω
$\sigma_i(x) = \mathrm{softplus}(\omega_i)$
$\Sigma(x) = B(x)B^{\top}(x)$
$B$
$O(d^3)$
Σ(x)
B(x)
$\omega = p(y = 1 \mid x)$
ω
n
$p(y \mid x) = \sum_{i=1}^{n} p(c = i \mid x)\,\mathcal{N}(y \mid \mu_i(x), \Sigma_i(x)).$
$p(c = i \mid x)$ $\mu_i(x)$ $\Sigma_i(x)$
$p(c = i \mid x)$
n
n
µ
i
(x) i
y d
n ×d n d
Σ
i
(x)
i x
$y$ $y = (y_1, y_2, \ldots, y_k)$
x = x y
i
$p(y_1, y_2, \ldots, y_k \mid x) = \prod_{i=1}^{k} p(y_i \mid x).$
$\log p(y_i \mid x)$ $p(y_i \mid x)$
y
i
x
x y
y y
(x, y
i
)
p(y; ω) ω
p(y | x; θ) ω ω = f
θ
(x) θ
y y
i
$2^8 = 256$
y
i
ω ω
x
J
θ
J θ
θ
J
O(·)
J θ
θ J
y
x
x y
θ
J J
θ x y z
x z y
z = J(g(θ))
x = θ
y = g(θ)
$\nabla_\theta J(g(\theta)) = \nabla_{g(\theta)} J(g(\theta))\,\frac{\partial g(\theta)}{\partial \theta}$
J g θ
θ g(θ)
g(θ)
θ
g(θ)
J(g(θ))
g(θ)
J(g(θ))
θ
g(θ)
θ
g(θ)
g(θ)
J(g(θ)) J(g(θ))
J(g(θ)) θ
z
y x x x
y y
y
x
y(x) z(y) y
z
g
$\nabla_\theta J(g(\theta)) = \sum_i \frac{\partial J(g(\theta))}{\partial g_i(\theta)}\,\frac{\partial g_i(\theta)}{\partial \theta}$
y
1
y
2
x z x
z n y
1
y
n
$\theta$ $J(g(\theta))$
$g_i(\theta)$ $x = \theta$ $y_i = g_i(\theta)$ $z = J(g(\theta))$
J
J
x
f M
h
k
a
k
W
(k)
f h
k+1
x
h
0
ˆ
y h
M
L(
ˆ
y, y)
ˆ
y y
J
J W b
h
(k)
a
(k)
y
ˆ
y
J
$h^{(0)} = x$
$k = 1, \ldots, M$
$a^{(k)} = b^{(k)} + W^{(k)} h^{(k-1)}$
$h^{(k)} = f(a^{(k)})$
$\hat{y} = h^{(M)}$
$J = L(\hat{y}, y) + \lambda\Omega$
J
M
b
(k)
W
(k)
f
k h
(k)
i
x y
a
(k)
k
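$g \leftarrow \nabla_{\hat{y}} J = \nabla_{\hat{y}} L(\hat{y}, y) + \lambda \nabla_{\hat{y}} \Omega$
$k = M$ down to $1$
$f$
$g \leftarrow \nabla_{a^{(k)}} J = g \odot f'(a^{(k)})$
$\nabla_{b^{(k)}} J = g + \lambda \nabla_{b^{(k)}} \Omega$
$\nabla_{W^{(k)}} J = g\, h^{(k-1)\top} + \lambda \nabla_{W^{(k)}} \Omega$
$g \leftarrow \nabla_{h^{(k-1)}} J = W^{(k)\top} g$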
$a_i^{(k)} = b_i^{(k)} + \sum_j W_{ij}^{(k)} h_j^{(k-1)}$
$h_i^{(k)} = f(a_i^{(k)})$
φ
θ J u
j
j
J
u
N
u
j
J
u
j
J
u
i
u
i
u
j
$\frac{\partial J}{\partial u_N} = \frac{\partial J}{\partial J} = 1$
u
i
u
j
u
i
u
j
u
i
f
i
a
(i)
u
j
$j < i$ $j \in \mathrm{parents}(i)$
x M u
1
u
M
u
N
$i = 1, \ldots, M$
$u_i \leftarrow x_i$
$i = M + 1, \ldots, N$
$a^{(i)} \leftarrow (u_j)_{j \in \mathrm{parents}(i)}$
$u_i \leftarrow f_i(a^{(i)})$
u
N
u
j
a
i
u
1
f
i
u
N
u
2
u
i
u
i
= f
i
(a
(i)
) a
(i)
u
j
u
i
u
1
. . . , u
M
u
N
J(θ)
u
i
u
j
j < i
J(θ)
$u_i = f_i(a^{(i)})$ $a^{(i)}$
$f_i$
$u_j$
$i$ $a^{(i)} = (u_j)_{j \in \mathrm{parents}(i)}$
f
i
(a
(i)
)
a
(i)
f
i
(a
(i)
)
a
ik
a
ik
u
1
u
2
u
3
u
1
u
3
u
3
u
1
u
3
u
2
u
3
u
2
u
2
u
1
u
i
u
i
k = π(i, j) u
j
a
(i)
u
j
u
i
u
i
u
j
f
i
(a
(i)
)
a
ik
f
i
k
u
j
u
i
$f_3(a_{3,1}, a_{3,2}) = e^{a_{3,1} + a_{3,2}}$
$f_2(a_{2,1}) = a_{2,1}^2$
$u_3 = f_3(u_2, u_1)$ $u_2 = f_2(u_1)$ $f_3$
$a_{3,2}$
$a_{3,1}$
$\frac{\partial f_3}{\partial a_{3,2}} = e^{a_{3,1} + a_{3,2}}$
u
3
u
1
u
1
u
3
$\frac{\partial u_3}{\partial u_1} = e^{u_1 + u_2}\,(1 + 2u_1)$
u
3
u
1
u
3
u
1
u
2
u
N
u
N
=1
u
N
u
j
u
N
u
i
j u
j
u
N
u
j
u
N
u
N
u
i
i j
j i
$\frac{\partial u_N}{\partial u_N} = 1$
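$\frac{\partial u_N}{\partial u_i} = \sum_{\text{paths } u_{k_1}, \ldots, u_{k_n}:\ k_1 = i,\ k_n = N} \ \prod_{j=2}^{n} \frac{\partial u_{k_j}}{\partial u_{k_{j-1}}}$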
$u_{k_1}, \ldots, u_{k_n}$
$k_1 = i$ $k_n = N$
$\frac{\partial u_{k_j}}{\partial u_{k_{j-1}}}$
u
N
u
i
π(i, j) u
j
f
i
u
N
u
i
i
u
N
u
1
. . . , u
M
u
N
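$\frac{\partial u_N}{\partial u_N} \leftarrow 1$
$j = N - 1$ down to $1$
$\frac{\partial u_N}{\partial u_j} \leftarrow \sum_{i:\ j \in \mathrm{parents}(i)} \frac{\partial u_N}{\partial u_i}\,\frac{\partial f_i(a_i)}{\partial a_{i,\pi(i,j)}}$
$\left(\frac{\partial u_N}{\partial u_i}\right)_{i=1}^{M}$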
$u_{k_{j-1}}$
$\pi(k_j, k_{j-1})$ $a_{k_j}$
$u_{k_j}$
$\frac{\partial u_{k_j}}{\partial u_{k_{j-1}}} = \frac{\partial f_{k_j}(a_{k_j})}{\partial a_{k_j,\,\pi(k_j, k_{j-1})}}.$
u
i
u
i
u
N
u
i
j i
f
i
(a
i
)
a
i,π(i,j)
(i)
m n
(i)
d m
d < m d > m
y x
y = module.fprop(x),
x y
J
x
J x
J
y
$\nabla_x J = \mathrm{module.bprop}(\nabla_y J).$
x y
y
x
x
x
y
y
J
$\nabla_x J = \left(\frac{\partial y}{\partial x}\right)^{\top} \nabla_y J.$
y
x
$\mu$ $\sigma^2$
$\mathcal{N}(\mu, \sigma^2).$
$\mu$ $\sigma^2$
$\eta \sim \mathcal{N}(0, 1)$
$z = \mu + \sigma\eta$
η
µ σ
η
J(z)
µ = f(x; θ) σ = g(x; θ)
θ
J(z)
$z \sim p(z \mid \omega)$ $\omega$
$z \sim p(z \mid \omega)$
$z = f(\omega, \eta)$
η
ω
z
f
z ω
z f
ω
η
f ω
z η
ω z
J
J
ω
η
z
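$E[J(z)] = \sum_z J(z)\,p(z)$
$\frac{\partial E[J(z)]}{\partial \omega} = \sum_z J(z)\,\frac{\partial p(z)}{\partial \omega} = \sum_z J(z)\,p(z)\,\frac{\partial \log p(z)}{\partial \omega} \approx \frac{1}{N}\sum_{z_i \sim p(z),\ i=1}^{N} J(z_i)\,\frac{\partial \log p(z_i)}{\partial \omega}$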
z
f
$\frac{\partial \log p(z)}{\partial \omega} = \frac{1}{p(z)}\,\frac{\partial p(z)}{\partial \omega}$
z
z
i
$\omega$ $p_i = p(z_i = 1 \mid \omega) = \mathrm{sigmoid}(\omega_i)$
$\frac{\partial \log p(z)}{\partial \omega_i}$
$z_i(1 - p_i) - (1 - z_i)\,p_i$
z
p(z) ω ω p(z)
x p(z) p(z|x) p(z|ω)
J(z) b(ω)
z
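$E_{p(z)}\left[\frac{\partial \log p(z)}{\partial \omega}\right] = \sum_z p(z)\,\frac{\partial \log p(z)}{\partial \omega} = \sum_z \frac{\partial p(z)}{\partial \omega} = \frac{\partial}{\partial \omega}\sum_z p(z) = \frac{\partial}{\partial \omega} 1 = 0,$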
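$E_{p(z)}\left[(J(z) - b(\omega))\,\frac{\partial \log p(z)}{\partial \omega}\right] = E_{p(z)}\left[J(z)\,\frac{\partial \log p(z)}{\partial \omega}\right] - b(\omega)\,E_{p(z)}\left[\frac{\partial \log p(z)}{\partial \omega}\right]$
$= E_{p(z)}\left[J(z)\,\frac{\partial \log p(z)}{\partial \omega}\right].$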
$b(\omega)$ $(J(z) - b(\omega))\,\frac{\partial \log p(z)}{\partial \omega}$
p(z) b(ω)
b
i
(ω) ω
i
ω
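$b_i(\omega) = \frac{E_{p(z)}\left[J(z)\left(\frac{\partial \log p(z)}{\partial \omega_i}\right)^2\right]}{E_{p(z)}\left[\left(\frac{\partial \log p(z)}{\partial \omega_i}\right)^2\right]}.$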
$\omega_i$
$(J(z) - b_i(\omega))\,\frac{\partial \log p(z)}{\partial \omega_i}$
$b_i(\omega)$ $b_i(\omega)$
ω
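$E_{p(z)}\left[J(z)\left(\frac{\partial \log p(z)}{\partial \omega_i}\right)^2\right]$ $E_{p(z)}\left[\left(\frac{\partial \log p(z)}{\partial \omega_i}\right)^2\right]$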
ω
log p(z)
ω
i
2
log p(z)
ω
i
2
z
p(z) ω
i ω J(z)
$b(\omega)$ $E_{p(z)}[J(z)]$
$b(\omega)$ $(J(z) - b(\omega))$
ω
ω
z J(z)
z
$\mathbb{R}^n$
$v \in \{0, 1\}^n$
$2^{2^n}$
$2^n$
$O(2^n)$
d
d
n
3 4 5 6 7 8 9 10 11
Number of hidden layers
92.0
92.5
93.0
93.5
94.0
94.5
95.0
95.5
96.0
96.5
Test accuracy (%)
Effect of Depth
0.0 0.2 0.4 0.6 0.8 1.0
Number of parameters
1e8
91
92
93
94
95
96
97
Test accuracy (%)
Effect of Number of Parameters
3, convolutional
3, fully connected
5
11, convolutional
x
x φ(x)
$\phi(x)$ $f_\theta(x) = b + w \cdot \phi(x)$
φ(x)
φ(x)
k(u, v) = φ(u) ·φ(v) ·
φ(·) φ(x)
φ
φ
$k(u, v) = \exp\left(-\|u - v\|^2\right)$
φ(x)
φ(·)
n
k k
k